Multivariate Data


Reading in Data and Loading Libraries

pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
library(ggplot2)
library(gridExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Third Qualitative Variable

p1 <- ggplot(aes(x = gender, y = age),
       data = subset(pf, !is.na(gender))) + geom_boxplot() +
  stat_summary(fun.y = mean, geom = "point", shape = 4)

p2 <- ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_line(aes(color = gender), stat = "summary", fun.y = median)

grid.arrange(p1, p2, ncol = 1)

Create a new data frame, that contains information on each age and gender group.

pf.fc_by_age_gender <- pf %>% 
  filter(!is.na(gender)) %>% 
  group_by(age, gender) %>% 
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            n= n()) %>%
  arrange(age)

head(pf.fc_by_age_gender)
## Source: local data frame [6 x 5]
## Groups: age [3]
## 
## # A tibble: 6 x 5
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <dbl> <int>
## 1    13 female          259.1606               148.0   193
## 2    13   male          102.1340                55.0   291
## 3    14 female          362.4286               224.0   847
## 4    14   male          164.1456                92.5  1078
## 5    15 female          538.6813               276.0  1139
## 6    15   male          200.6658               106.5  1478

Plotting Conditional Summaries

ggplot(aes(x = age, y = median_friend_count),
       data = pf.fc_by_age_gender) +
  geom_line(aes(color = gender))


Thinking in Ratios


Wide and Long Format

Notes: Use of reshape2 or tidyr (easier)


Reshaping Data

pf.fc_by_age_gender.wide = pf.fc_by_age_gender %>%
  select(age, gender, median_friend_count) %>%
  filter(!is.na(gender)) %>% 
  spread(gender, median_friend_count) %>% 
  mutate(ratio = female / male)

head(pf.fc_by_age_gender.wide)
## Source: local data frame [6 x 4]
## Groups: age [6]
## 
## # A tibble: 6 x 4
##     age female  male    ratio
##   <int>  <dbl> <dbl>    <dbl>
## 1    13  148.0  55.0 2.690909
## 2    14  224.0  92.5 2.421622
## 3    15  276.0 106.5 2.591549
## 4    16  258.5 136.0 1.900735
## 5    17  245.5 125.0 1.964000
## 6    18  243.0 122.0 1.991803

Ratio Plot

ggplot(aes(x = age, y = ratio), data = pf.fc_by_age_gender.wide) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)


Third Quantitative Variable

pf$year_joined = floor(2014 - pf$tenure / 365)

Cut a Variable

summary(pf$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
table(pf$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70
pf$year_joined.bucket <- cut(pf$year_joined,
                             c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined.bucket, useNA = "ifany")
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]        <NA> 
##        6669       15308       33366       43658           2

Plotting it All Together

Notes: Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age.

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = median)


Plot the Grand Mean

Notes: Plotting the grand mean is a good reminder that much of the data in the sample is about members of recent cohorts.

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = mean) +
  geom_line(stat = "summary", fun.y = mean, linetype = 2)


Friending Rate

Notes: Max 417 is definetely an outlier since 3rd quantile is only about 0.57.

with(subset(pf, tenure >= 1), summary(friend_count / tenure))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0775   0.2205   0.6096   0.5658 417.0000

Friendships Initiated

Notes: Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.

What is the median friend rate? 0.2205

What is the maximum friend rate? 417

ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = mean)

Notes: Users with more tenure typically initiate less friendships.


Bias-Variance Tradeoff Revisited

Notes:

p3 = ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = mean)

p4 = ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

p5 = ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

p6 = ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

p7 = ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_smooth(aes(color = year_joined.bucket))

grid.arrange(p3, p4, p5, p6, p7, ncol = 1)
## `geom_smooth()` using method = 'gam'


Sean’s NFL Fan Sentiment Study

Notes: Bias Variance Trade Off


Introducing the Yogurt Data Set


Histograms Revisited

yo = read.csv("yogurt.csv")

# Change the id from an int to a factor
yo$id = factor(yo$id)
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...
p8 = ggplot(aes(price), data = yo) +
  geom_histogram(fill = "orange")

p9 = ggplot(aes(price), data = yo) +
  geom_histogram(fill = "orange", binwidth = 10)

grid.arrange(p8, p9, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We notice * Important discreetness * For this very discreet data, this histogram is a very biased model.


Number of Purchases

Notes: See discreetness with summary or by length and unique. The 3rd quantile is about the same with the maximum.

summary(yo)
##       obs               id            time         strawberry     
##  Min.   :   1.0   2132290:  74   Min.   : 9662   Min.   : 0.0000  
##  1st Qu.: 696.5   2130583:  59   1st Qu.: 9843   1st Qu.: 0.0000  
##  Median :1369.5   2124073:  50   Median :10045   Median : 0.0000  
##  Mean   :1367.8   2149500:  50   Mean   :10050   Mean   : 0.6492  
##  3rd Qu.:2044.2   2101790:  47   3rd Qu.:10255   3rd Qu.: 1.0000  
##  Max.   :2743.0   2129528:  39   Max.   :10459   Max.   :11.0000  
##                   (Other):2061                                    
##    blueberry        pina.colada          plain         mixed.berry    
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :0.0000  
##  Mean   : 0.3571   Mean   : 0.3584   Mean   :0.2176   Mean   :0.3887  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :12.0000   Max.   :10.0000   Max.   :6.0000   Max.   :8.0000  
##                                                                       
##      price      
##  Min.   :20.00  
##  1st Qu.:50.00  
##  Median :65.04  
##  Mean   :59.25  
##  3rd Qu.:68.96  
##  Max.   :68.96  
## 
unique(yo$price)
##  [1] 58.96 65.04 48.96 68.96 39.04 24.96 50.00 45.04 33.04 44.00 33.36
## [12] 55.04 62.00 20.00 49.60 49.52 33.28 63.04 33.20 33.52
length(unique(yo$price))
## [1] 20
table(yo$price)
## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609
yo = transform(yo, all.purchases = strawberry + blueberry +
                 pina.colada + plain + mixed.berry)
summary(yo$all.purchases)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000

Prices over Time

Notes: The modal or most common prices seem to be increasing over time. We also see some lower prices scattered about the graph. These may be due to sales or buyers using coupons that bring the prices down.

ggplot(aes(all.purchases), data = yo) +
  geom_histogram(fill = "cyan", binwidth = 1)

ggplot(aes(time, price), data = yo) +
  geom_jitter(alpha = 1/4, shape = 21, fill = "brown")


Sampling Observations

Notes: Take a closer look applying sub-sampling.


Looking at Samples of Households

Notes: x %in% y returns a logical (boolean) vector the same length as x that says whether each entry in x appears in y. That is, for each entry in x, it checks to see whether it is in y. Use the pch or shape parameter to specify the symbol when plotting points. Using the size parameter, we add more detail to the plot.

# Set the seed for reproducible results
set.seed(4230)
sample.ids = sample(levels(yo$id), 16)

ggplot(aes(time, price), data = subset(yo, id %in% sample.ids)) +
  facet_wrap(~ id) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1)


The Limits of Cross Sectional Data

Notes: The general idea is that we have observations over time, we can face by the primary unit, case or individual in the data set. For yogurt data, it was the households we were faceting over. The same plot cannot be generated with the pseudo Facebook data set, since we don’t have data on our sample of users over time.


Many Variables


Scatterplot Matrix

Notes: If the plot takes a long time to render or if you want to see some of the scatterplot matrix, then only examine a smaller number of variables. You can use the following code or select fewer variables. We recommend including gender (the 6th variable)!

pf_subset <- pf[ , c(2:7)]

You may also find that variable labels are on the outer edges of the scatterplot matrix, rather than on the diagonal. If you want labels in the diagonal, you can set the axisLabels = ‘internal’ argument in your ggpairs command.

# install.packages('GGally')
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
theme_set(theme_minimal(10))
# Set the seed for reproducible results
set.seed(1836)

pf_subset = pf[, c(2:15)]
names(pf_subset)
##  [1] "age"                   "dob_day"              
##  [3] "dob_year"              "dob_month"            
##  [5] "gender"                "tenure"               
##  [7] "friend_count"          "friendships_initiated"
##  [9] "likes"                 "likes_received"       
## [11] "mobile_likes"          "mobile_likes_received"
## [13] "www_likes"             "www_likes_received"
ggpairs(pf_subset)
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 2 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_point).

  • Scatterplots are below the diagonal, and categorical variables, like gender, create faceted histograms.

Even More Variables

Notes: Micro-array/Gene Expression Data

nci = read.table("nci.tsv")

# Change the colnames to produce a nicer plot
colnames(nci) = c(1:64)

Heat Maps

Notes:

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
##   gene case  value
## 1    1    1  0.300
## 2    2    1  1.180
## 3    3    1  0.550
## 4    4    1  1.140
## 5    5    1 -0.265
## 6    6    1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
  data = nci.long.samp) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))


Analyzing Three or More Variables

Reflection * Simple extensions to scatter plots and plots of conditional summaries * Scatter plot matrices and heat maps * Reshape data (long <—–> wide)


Price Histograms with Facet and Color

data("diamonds")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(aes(x = price), data = diamonds) +
  geom_histogram(aes(fill = cut)) +
  facet_wrap(~ color) +
  scale_fill_brewer(palette = "Spectral") +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Price vs. Table Colored by Cut

ggplot(aes(x = table, y = price, color = cut), data = diamonds) +
  geom_point() +
  scale_color_brewer(type = 'qual') +
  scale_x_continuous(limits = c(50, 80), breaks = seq(50, 80, 2))
## Warning: Removed 5 rows containing missing values (geom_point).

Price vs. Volume and Diamond Clarity

diamonds$volume = diamonds$x * diamonds$y * diamonds$z
ggplot(data = subset(diamonds, volume != 0 & diamonds$volume <= quantile(diamonds$volume, 0.99)),
       aes(x = volume, y = price)) +
  geom_point(aes(color = clarity)) +
  scale_color_brewer(type = 'div') +
  scale_y_log10()

Proportion Of Friendships Initiated

pf <- transform(pf,prop_initiated = ifelse(friend_count == 0,
                                           0, friendships_initiated/friend_count))

Prop_initiated Vs. Tenure

ggplot(aes(x = tenure, y = prop_initiated),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary", fun.y = median)

Smoothing prop_initiated vs. tenure

ggplot(aes(x = tenure, y = prop_initiated),
       data = subset(pf, tenure > 0)) +
  geom_smooth(aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam'

Largest Group Mean Prop_initiated

lg_prop_init <- subset(pf, year_joined.bucket == '(2012,2014]')
summary(lg_prop_init$prop_initiated)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5000  0.6912  0.6430  0.8438  1.0000

Price/Carat Binned, Faceted, & Colored

ggplot(data = diamonds, aes(x = cut, y = price/carat, color = color)) +
  geom_jitter(alpha = 0.3) +
  facet_wrap(~ clarity, ncol = 2) +
  scale_color_brewer(type = 'div')